import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
print("Libraries successfully")
import warnings
warnings.filterwarnings("ignore")
# Column name -> dtype mapping handed to pd.read_csv(dtype=...) when the
# training CSV is loaded.  String-valued columns are read as 'category' and
# numeric columns use narrow int/float widths (presumably to keep the loaded
# frame small in memory -- confirm).
# NOTE(review): many *Identifier / Is* columns are declared as floats rather
# than ints; presumably because they can contain missing values -- confirm
# against the data before changing any width.
dtypes = {
'MachineIdentifier': 'category',
'ProductName': 'category',
'EngineVersion': 'category',
'AppVersion': 'category',
'AvSigVersion': 'category',
'IsBeta': 'int8',
'RtpStateBitfield': 'float16',
'IsSxsPassiveMode': 'int8',
'DefaultBrowsersIdentifier': 'float16',
'AVProductStatesIdentifier': 'float32',
'AVProductsInstalled': 'float16',
'AVProductsEnabled': 'float16',
'HasTpm': 'int8',
'CountryIdentifier': 'int16',
'CityIdentifier': 'float32',
'OrganizationIdentifier': 'float16',
'GeoNameIdentifier': 'float16',
'LocaleEnglishNameIdentifier': 'int8',
'Platform': 'category',
'Processor': 'category',
'OsVer': 'category',
'OsBuild': 'int16',
'OsSuite': 'int16',
'OsPlatformSubRelease': 'category',
'OsBuildLab': 'category',
'SkuEdition': 'category',
'IsProtected': 'float16',
'AutoSampleOptIn': 'int8',
'PuaMode': 'category',
'SMode': 'float16',
'IeVerIdentifier': 'float16',
'SmartScreen': 'category',
'Firewall': 'float16',
'UacLuaenable': 'float32',
'Census_MDC2FormFactor': 'category',
'Census_DeviceFamily': 'category',
'Census_OEMNameIdentifier': 'float16',
'Census_OEMModelIdentifier': 'float32',
'Census_ProcessorCoreCount': 'float16',
'Census_ProcessorManufacturerIdentifier': 'float16',
'Census_ProcessorModelIdentifier': 'float16',
'Census_ProcessorClass': 'category',
'Census_PrimaryDiskTotalCapacity': 'float32',
'Census_PrimaryDiskTypeName': 'category',
'Census_SystemVolumeTotalCapacity': 'float32',
'Census_HasOpticalDiskDrive': 'int8',
'Census_TotalPhysicalRAM': 'float32',
'Census_ChassisTypeName': 'category',
'Census_InternalPrimaryDiagonalDisplaySizeInInches': 'float16',
'Census_InternalPrimaryDisplayResolutionHorizontal': 'float16',
'Census_InternalPrimaryDisplayResolutionVertical': 'float16',
'Census_PowerPlatformRoleName': 'category',
'Census_InternalBatteryType': 'category',
'Census_InternalBatteryNumberOfCharges': 'float32',
'Census_OSVersion': 'category',
'Census_OSArchitecture': 'category',
'Census_OSBranch': 'category',
'Census_OSBuildNumber': 'int16',
'Census_OSBuildRevision': 'int32',
'Census_OSEdition': 'category',
'Census_OSSkuName': 'category',
'Census_OSInstallTypeName': 'category',
'Census_OSInstallLanguageIdentifier': 'float16',
'Census_OSUILocaleIdentifier': 'int16',
'Census_OSWUAutoUpdateOptionsName': 'category',
'Census_IsPortableOperatingSystem': 'int8',
'Census_GenuineStateName': 'category',
'Census_ActivationChannel': 'category',
'Census_IsFlightingInternal': 'float16',
'Census_IsFlightsDisabled': 'float16',
'Census_FlightRing': 'category',
'Census_ThresholdOptIn': 'float16',
'Census_FirmwareManufacturerIdentifier': 'float16',
'Census_FirmwareVersionIdentifier': 'float32',
'Census_IsSecureBootEnabled': 'int8',
'Census_IsWIMBootEnabled': 'float16',
'Census_IsVirtualDevice': 'float16',
'Census_IsTouchEnabled': 'int8',
'Census_IsPenCapable': 'int8',
'Census_IsAlwaysOnAlwaysConnectedCapable': 'float16',
'Wdft_IsGamer': 'float16',
'Wdft_RegionIdentifier': 'float16',
# Dependent variable used by the EDA helpers further down.
'HasDetections': 'int8'
}
print("Data types added")
# Read the cleaned training set, applying the explicit per-column dtypes.
df = pd.read_csv("../Data/train_clean.csv", dtype=dtypes)
df.head()
# Discard the "Unnamed: 0" column (presumably a row index written out when the
# cleaned CSV was saved -- confirm).
df = df.drop(columns=["Unnamed: 0"])
df.head()
df.shape
df.info()
def listRemove(list_name, col):
    """Remove the first occurrence of ``col`` from ``list_name`` in place.

    When ``col`` is not in the list a message is printed and the list is left
    untouched; no exception propagates to the caller.

    Args:
        list_name: List to modify.
        col: Element to remove.

    Returns:
        The same list object that was passed in.
    """
    try:
        list_name.remove(col)
    except ValueError:
        print("Issue while removing column")
    return list_name
#!conda create -n jupyterlab-debugger -c conda-forge jupyterlab=3 xeus-python
#!conda activate jupyterlab-debugger
def columnsTypesSeggregation(df):
    """Split the columns of ``df`` into boolean, categorical and numeric groups.

    A column is "boolean" when it holds exactly two distinct non-null values,
    whatever its dtype.  Of the remaining columns, those with a numeric
    (non-bool) dtype are "numeric" and everything else is "category".  The
    size of each group is printed.

    Args:
        df: DataFrame whose columns are to be classified.

    Returns:
        Tuple ``(boolean_cols, cat_cols, num_cols)`` of column-name lists,
        each in the column order of ``df``.
    """
    boolean_cols = [column for column in df.columns if df[column].nunique() == 2]
    print("No of Boolean Columns = " + str(len(boolean_cols)))

    two_valued = set(boolean_cols)
    cat_cols, num_cols = [], []
    for col in df.columns:
        if col in two_valued:
            continue
        dtype = df[col].dtype
        # Test the dtype kind rather than a fixed list of dtype names: the old
        # whitelist (float16/int8/float32/int16) silently filed int32, int64
        # and float64 columns under "category".  Bool dtypes stay on the
        # category side, as they did before.
        is_number = pd.api.types.is_numeric_dtype(dtype) and not pd.api.types.is_bool_dtype(dtype)
        if is_number:
            num_cols.append(col)
        else:
            cat_cols.append(col)

    print("No of Category columns = " + str(len(cat_cols)))
    print("No of Number columns = " + str(len(num_cols)))
    return boolean_cols, cat_cols, num_cols
print("seggregation column ready to run")
# Classify every column of the loaded frame into the three groups that the
# rest of the analysis iterates over.
boolean_cols,cat_cols,num_cols=columnsTypesSeggregation(df)
# HasDetections is the dependent variable the EDA helpers compare against, so
# it is taken out of the predictor list.  list.remove raises ValueError if the
# column was not classified as boolean.
boolean_cols.remove("HasDetections")
# Bare expressions: no effect when run as a plain script (presumably left over
# from notebook cells that displayed the three lists).
boolean_cols
cat_cols
num_cols
def boolean_cols_eda(df, column, dependent="HasDetections"):
    """Cross-tabulate ``column`` against a 0/1 ``dependent`` column.

    Args:
        df: DataFrame containing both columns.
        column: Name of the column whose values become the rows of the result.
        dependent: Name of the 0/1 outcome column.

    Returns:
        DataFrame indexed by the values of ``column`` with two count columns,
        "No Malware detected" (dependent == 0) and "Malware Detected"
        (dependent == 1).
    """
    counts = df[[column, dependent]].groupby([column, dependent]).size().unstack(fill_value=0)
    # Guarantee both outcome columns exist: when the data holds only one class
    # of `dependent`, unstack() produces a single column and the lookups below
    # would raise KeyError.
    counts = counts.reindex(columns=[0, 1], fill_value=0)
    return pd.DataFrame({"No Malware detected": counts[0], "Malware Detected": counts[1]})
# Summarise every two-valued column against HasDetections: one bar chart per
# value of the column, followed by a table of counts and detection rates.
print("\nBoolean columns Summary")
colors = ['#2ca02c', '#d62728']
for col in boolean_cols:
    eda_df = boolean_cols_eda(df, col)
    print("\n\n " + col + " Vs HasDetections\n\n")
    fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(15, 7))
    # One panel per row of the table (at most the first two).  Each title uses
    # the value actually found in that row: two-valued columns are not
    # necessarily coded 0/1, so hard-coded "is 0"/"is 1" titles could mislabel
    # the panels.  zip() also copes with a table that has fewer than two rows.
    for axis, (value, counts) in zip(ax, eda_df.iterrows()):
        counts[["No Malware detected", "Malware Detected"]].plot(kind='bar', ax=axis, color=colors)
        axis.set_title("When '" + col + "' is " + str(value))
    fig.tight_layout(pad=3.0)
    plt.show()
    eda_df["Total of each category"] = eda_df.sum(axis=1)
    eda_df["% of Malware detected"] = (eda_df["Malware Detected"] * 100) / (eda_df["Malware Detected"] + eda_df["No Malware detected"])
    print(eda_df)
print("\nBoolean columns Summary")
def cat_cols_eda(df, column, dependent="HasDetections"):
    """Count rows per category of ``column``, split by a 0/1 ``dependent``.

    Args:
        df: DataFrame containing both columns.
        column: Name of the categorical column; its values index the result.
        dependent: Name of the 0/1 outcome column.

    Returns:
        DataFrame indexed by category with two count columns,
        "No Malware detected" (dependent == 0) and "Malware Detected"
        (dependent == 1).
    """
    per_category = df[[column, dependent]].groupby([column, dependent]).size().unstack(fill_value=0)
    # If only one outcome class is present, unstack() yields a single column;
    # add the missing one as zeros instead of failing with KeyError below.
    per_category = per_category.reindex(columns=[0, 1], fill_value=0)
    return pd.DataFrame({
        "No Malware detected": per_category[0],
        "Malware Detected": per_category[1],
    })
# For each categorical column: tabulate detections per category, keep the
# categories with more than 100 rows, and chart those whose detection rate is
# above 52% (left panel) or below 48% (right panel).
for col in cat_cols:
    print("\n\n " + col + " Vs HasDetections\n\n")
    eda_df = cat_cols_eda(df, col)
    eda_df["Total of each category"] = eda_df.sum(axis=1)
    eda_df["% of Malware detected"] = (eda_df["Malware Detected"] * 100) / eda_df["Total of each category"]
    # Filter and order in one step: highest detection rate first.
    eda_df = eda_df[eda_df["Total of each category"] > 100].sort_values(
        by="% of Malware detected", ascending=False
    )
    fig, ax = plt.subplots(1, 2, figsize=(20, 8))
    temp1 = eda_df[eda_df["% of Malware detected"] > 52.0]
    if len(temp1) > 0:
        temp1[["No Malware detected", "Malware Detected"]].plot(kind='bar', ax=ax[0])
        ax[0].set_title("Most Vulnerable categories in '" + col + "' - high percentages in malware detected")
    temp2 = eda_df[eda_df["% of Malware detected"] < 48.0]
    if len(temp2) > 0:
        temp2[["No Malware detected", "Malware Detected"]].plot(kind='bar', ax=ax[1])
        ax[1].set_title("Safest categories in '" + col + "' - lowest percentages in malware detected")
    plt.show()
    print(eda_df)
print("\nCategory Columns Summary")
num_cols
# Plot from a random subset of rows.  The request is capped at the frame's
# size because DataFrame.sample raises ValueError when asked for more rows
# than exist (without replacement).
data_temp = df.sample(min(20000, len(df)))
for col in num_cols:
    fig, ax = plt.subplots(1, 2, figsize=(10, 7))
    # Left: box plot of the column for each HasDetections class.
    sns.boxplot(y=col, x='HasDetections', data=data_temp, ax=ax[0])
    # Right: one density curve per class.  Each curve carries its own label so
    # the legend stays correct even if one of the curves is not drawn;
    # positional legend labels would then attach to the wrong curve.
    sns.kdeplot(x=data_temp[data_temp.HasDetections == 1][col], ax=ax[1], label="Malware Detected")
    sns.kdeplot(x=data_temp[data_temp.HasDetections == 0][col], ax=ax[1], label="No Malware Detected")
    ax[1].legend(loc='best')
    plt.show()
# Scatter every numeric column against the one listed just before it in
# num_cols (the first column is paired with the last), coloured by
# HasDetections.  A fresh 10000-row sample is drawn for each plot.
previous_cols = num_cols[-1:] + num_cols[:-1]
for col, previous_col in zip(num_cols, previous_cols):
    sns.scatterplot(x=col, y=previous_col, hue="HasDetections", data=df.sample(10000))
    plt.show()
print("Bi variant numeric columns")
# A few columns that the dtype-based segregation placed in the numeric group are actually categorical. Let's move them to the categorical columns and run the categorical summary on them to get insights.
# Numeric-typed columns that are treated as categories in this section.
# Each name appears once: 'OsSuite' used to be listed twice, which made the
# loop below summarise and plot it twice.
new_cat_columns = [
    'OsSuite', 'OrganizationIdentifier', 'Wdft_RegionIdentifier',
    'Census_FirmwareManufacturerIdentifier', 'Census_ProcessorManufacturerIdentifier',
    'Census_ProcessorCoreCount', 'AVProductsInstalled', 'AVProductsEnabled',
    'RtpStateBitfield', 'UacLuaenable', 'Census_OSBuildNumber', 'OsBuild',
]
new_cat_columns
# Same per-category summary as for the categorical columns: counts and
# detection rate per value, restricted to values with more than 100 rows,
# with the >52% and <48% detection-rate groups charted side by side.
for col in new_cat_columns:
    print("\n\n " + col + " Vs HasDetections\n\n")
    eda_df = cat_cols_eda(df, col)
    eda_df["Total of each category"] = eda_df.sum(axis=1)
    eda_df["% of Malware detected"] = (eda_df["Malware Detected"] * 100) / (eda_df["Malware Detected"] + eda_df["No Malware detected"])
    eda_df = eda_df[eda_df["Total of each category"] > 100]
    # Reassign rather than sort in place: eda_df is a filtered slice here, and
    # an in-place sort on it triggers pandas' chained-assignment warning.
    eda_df = eda_df.sort_values(by="% of Malware detected", ascending=False)
    fig, ax = plt.subplots(1, 2, figsize=(20, 8))
    temp1 = eda_df[eda_df["% of Malware detected"] > 52.0]
    if temp1.shape[0] > 0:
        temp1[["No Malware detected", "Malware Detected"]].plot(kind='bar', ax=ax[0])
        ax[0].set_title("Most Vulnerable categories in '" + col + "' - high percentages in malware detected")
    temp2 = eda_df[eda_df["% of Malware detected"] < 48.0]
    if temp2.shape[0] > 0:
        temp2[["No Malware detected", "Malware Detected"]].plot(kind='bar', ax=ax[1])
        ax[1].set_title("Safest categories in '" + col + "' - lowest percentages in malware detected")
    plt.show()
    print(eda_df)
# Numeric columns left once the ones re-classified as categorical are set
# aside.  Filtering num_cols (instead of taking a set difference) keeps the
# column order stable, so the pairplot layout is the same on every run.
reclassified = set(new_cat_columns)
rem_num_cols = [name for name in num_cols if name not in reclassified]
rem_num_cols
# Keep only rows in which every plotted column is finite.  The filter used to
# test `col`, a variable left over from the preceding loop, so it checked one
# unrelated column instead of the columns being plotted.
finite_data = df[np.isfinite(df[rem_num_cols]).all(axis=1)]
temp_data = finite_data[rem_num_cols + ['HasDetections']]
# Cap the sample size: DataFrame.sample raises ValueError when fewer rows are
# available than requested.
temp_data = temp_data.sample(min(1000, len(temp_data)))
snspair = sns.pairplot(data=temp_data, hue='HasDetections', height=15)
snspair.savefig('../Images/pairplot.png')
snspair